In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation, DBSCAN

# Pandas display configuration: show wide frames fully, keep long ones short.
for option, value in [('display.max_columns', 100),
                      ('display.max_rows', 10),
                      ('display.width', 100)]:
    pd.set_option(option, value)

print("Libraries imported successfully.")

# Enumerate every data file available under the input/ directory.
for dirname, _, filenames in os.walk('input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
Libraries imported successfully.
input/test.csv
input/train.csv
input/gender_submission.csv
In [2]:
# Data preparation: load the Titanic training set and preview the first rows.
df = pd.read_csv("input/train.csv")

display(df.head(10))
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q
6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S
7 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NaN S
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 347742 11.1333 NaN S
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 NaN C
In [3]:
# Exploratory profiling report for the raw training frame.
import ydata_profiling

report = ydata_profiling.ProfileReport(df)
display(report)
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Re-read the training data and keep only the columns used for clustering.
data = pd.read_csv("input/train.csv", sep=",")
data1 = data[['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
# Rich HTML display instead of print() — print() flattens a DataFrame to text.
display(data1.head())
data1.info()
   Survived  Pclass   Age  SibSp  Parch     Fare
0         0       3  22.0      1      0   7.2500
1         1       1  38.0      1      0  71.2833
2         1       3  26.0      0      0   7.9250
3         1       1  35.0      1      0  53.1000
4         0       3  35.0      0      0   8.0500
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Age       714 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
dtypes: float64(2), int64(4)
memory usage: 41.9 KB
In [5]:
data2=data1.dropna(axis=0).reset_index(drop=True)
In [6]:
# Clustering with the k-means method
# Rebuild the working frame from data1; plain reassignment suffices —
# `del data2` immediately before rebinding the same name is redundant.
data2 = data1.dropna(axis=0).reset_index(drop=True)
def doKmeans(X, nclust=2, random_state=None):
    """Fit k-means on X and return the cluster labels and centers.

    Parameters
    ----------
    X : array-like / DataFrame of shape (n_samples, n_features)
        Data to cluster.
    nclust : int
        Number of clusters to form.
    random_state : int or None
        Seed for centroid initialisation; pass an int to make the
        clustering reproducible across notebook re-runs.

    Returns
    -------
    (labels, centers) : (ndarray of shape (n_samples,), ndarray of shape (nclust, n_features))
    """
    # n_init is pinned explicitly because its sklearn default changed
    # (10 -> 'auto'); spelling it out keeps results stable across versions.
    model = KMeans(n_clusters=nclust, n_init=10, random_state=random_state)
    # fit_predict is equivalent to fit() followed by predict() on the same X.
    clust_labels = model.fit_predict(X)
    cent = model.cluster_centers_
    return (clust_labels, cent)

clust_labels, cent = doKmeans(data2, 5)
kmeans = pd.DataFrame(clust_labels)
# Append the cluster id as the last column, then inspect per-cluster means.
data2.insert(len(data2.columns), 'kmeans', kmeans)
data2.groupby('kmeans').mean()
Out[6]:
Survived Pclass Age SibSp Parch Fare
kmeans
0 0.283224 2.544662 32.125272 0.224401 0.217865 13.108678
1 0.764706 1.000000 31.235294 0.941176 1.352941 231.153676
2 0.686047 1.069767 35.906047 0.627907 0.488372 94.726500
3 1.000000 1.000000 35.333333 0.000000 0.333333 512.329200
4 0.570470 2.127517 18.354027 1.295302 0.953020 34.509536
In [7]:
# Visualise the k-means partition in the Age/Fare plane.
fig, ax = plt.subplots()
points = ax.scatter(data2['Age'], data2['Fare'], c=kmeans[0], s=50)
ax.set_title('K-Means Clustering')
ax.set_xlabel('Age')
ax.set_ylabel('Fare')
plt.colorbar(points)
Out[7]:
<matplotlib.colorbar.Colorbar at 0x7467f304f1d0>
In [8]:
# Hierarchical clustering
# Rebuild the working frame so the 'kmeans' column added above is excluded
# from the distance computation (`del` before rebinding is unnecessary).
data2 = data1.dropna(axis=0).reset_index(drop=True)
# Ward linkage over the first 10 passengers only, keeping the dendrogram legible.
merg = linkage(data2[0:10], method="ward")
print(merg)
[[  2.           7.           3.9106507    2.        ]
 [  6.           9.           5.30477379   2.        ]
 [  0.          10.           6.15841156   3.        ]
 [  4.          12.          12.34003146   4.        ]
 [  8.          11.          18.2324326    3.        ]
 [  1.           3.          18.42911823   2.        ]
 [  5.          15.          23.5213758    3.        ]
 [ 13.          14.          46.58343356   7.        ]
 [ 16.          17.         102.82744677  10.        ]]
In [9]:
# Draw the dendrogram of the ward linkage computed above.
fig, ax = plt.subplots()
dendrogram(merg, leaf_rotation=0)
ax.set_title('Hierarchical Clustering')
ax.set_xlabel('data points')
ax.set_ylabel('euclidean distance')
Out[9]:
Text(0, 0.5, 'euclidean distance')
In [10]:
data2[0:10]
Out[10]:
Survived Pclass Age SibSp Parch Fare
0 0 3 22.0 1 0 7.2500
1 1 1 38.0 1 0 71.2833
2 1 3 26.0 0 0 7.9250
3 1 1 35.0 1 0 53.1000
4 0 3 35.0 0 0 8.0500
5 0 1 54.0 0 0 51.8625
6 0 3 2.0 3 1 21.0750
7 1 3 27.0 0 2 11.1333
8 1 2 14.0 1 0 30.0708
9 1 3 4.0 1 1 16.7000
In [11]:
# Agglomerative clustering
def doAgglomerative(X, nclust=2):
    """Run ward-linkage agglomerative clustering on X and return the labels."""
    clusterer = AgglomerativeClustering(n_clusters=nclust,
                                        metric='euclidean',
                                        linkage='ward')
    return clusterer.fit_predict(X)

clust_labels1 = doAgglomerative(data2, 5)
agglomerative = pd.DataFrame(clust_labels1)
# Attach the labels as the final column of the working frame.
data2.insert(len(data2.columns), 'agglomerative', agglomerative)
In [12]:
data2.groupby('agglomerative').mean()
Out[12]:
Survived Pclass Age SibSp Parch Fare
agglomerative
0 0.334507 2.514085 28.484595 0.448944 0.394366 15.877105
1 0.733333 1.000000 32.430667 0.600000 0.866667 131.183883
2 0.653061 1.234694 35.632653 0.795918 0.377551 68.176576
3 1.000000 1.000000 35.333333 0.000000 0.333333 512.329200
4 0.733333 1.000000 30.333333 1.000000 1.333333 239.991940
In [13]:
%matplotlib inline
fig = plt.figure()
ax = fig.add_subplot(111)
scatter = ax.scatter(data2['Age'],data2['Fare'],c=agglomerative[0],s=50)
ax.set_title('Agglomerative Clustering')
ax.set_xlabel('Age')
ax.set_ylabel('Fare')
plt.colorbar(scatter)
Out[13]:
<matplotlib.colorbar.Colorbar at 0x7467f2a83290>
No description has been provided for this image
In [14]:
plt.show()
In [15]:
# DBSCAN clustering on the preprocessed full training set.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
numeric_features = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_features = ['Pclass', 'Sex', 'Embarked']

# Numeric columns: mean-impute then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: mode-impute then one-hot encode.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])
X = preprocessor.fit_transform(data[features])

# Reduce to two principal components for a plottable 2-D embedding.
X_pca = PCA(n_components=2).fit_transform(X)

eps = 0.5        # neighbourhood radius
min_samples = 5  # minimum points to form a dense region

clusters = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X_pca)

plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis', s=50, alpha=0.7)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('DBSCAN Clustering for Titanic Data')
plt.show()
No description has been provided for this image
In [ ]:
 
In [ ]: